In [23]:

    
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide seaborn theme: "notebook" context, "white" style, "RdBu" color palette.
sns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import recommender as rcmd

Load the data and set up

Notes:

- `X`: num_movies (1682) x num_features (10) matrix of movie features
- `Theta`: num_users (943) x num_features (10) matrix of user features
- `Y`: num_movies x num_users matrix of user ratings of movies
- `R`: num_movies x num_users matrix, where `R(i, j) = 1` if the i-th movie was rated by the j-th user



In [24]:

    
# Read the ratings file and pull out the two matrices it stores.
movies_mat = sio.loadmat('./data/ex8_movies.mat')
Y = movies_mat.get('Y')  # ratings, movies x users
R = movies_mat.get('R')  # rated-indicator matrix, same shape as Y

Y.shape, R.shape









    Out[24]:





((1682, 943), (1682, 943))



In [25]:

    
# Dimensions of the ratings matrix: rows are movies (m), columns are users (u).
m, u = np.shape(Y)

# Number of features describing each movie.
n = 10



In [26]:

    
# Read the movie/user parameter matrices stored in the exercise's params file.
param_mat = sio.loadmat('./data/ex8_movieParams.mat')
X = param_mat.get('X')          # movie features
theta = param_mat.get('Theta')  # user features

theta.shape, X.shape









    Out[26]:





((943, 10), (1682, 10))

cost



In [27]:

    
# Evaluate the cost on a small slice of the data, as the exercise PDF does.
users, movies, features = 4, 5, 3

# Ratings and rated-mask restricted to the first `movies` rows / `users` columns.
Y_sub = Y[:movies, :users]
R_sub = R[:movies, :users]

# Matching slices of the parameter matrices, keeping only `features` columns.
X_sub = X[:movies, :features]
theta_sub = theta[:users, :features]

param_sub = rcmd.serialize(X_sub, theta_sub)

rcmd.cost(param_sub, Y_sub, R_sub, features)









    Out[27]:





22.224603725685675



In [28]:

    
# Serialize the full parameter matrices once and reuse the result for the cost,
# rather than serializing X and theta a second time inside the call.
param = rcmd.serialize(X, theta)  # total real params

rcmd.cost(param, Y, R, 10)  # this is real total cost









    Out[28]:





27918.64012454421

gradient



In [29]:

    
# Compute the (unregularized) gradient on the full data set and split it back
# into one gradient per parameter matrix.
n_movie, n_user = Y.shape

serialized_grad = rcmd.gradient(param, Y, R, 10)
X_grad, theta_grad = rcmd.deserialize(serialized_grad, n_movie, n_user, 10)



In [30]:

    
# Each gradient must have the same shape as the parameter matrix it belongs to.
for grad, target in ((X_grad, X), (theta_grad, theta)):
    assert grad.shape == target.shape

regularized cost



In [31]:

    
# In ex8_cofi.m the regularized cost is checked with lambda = 1.5 on the
# sub data set, so the same subset and l=1.5 are used here.
rcmd.regularized_cost(param_sub, Y_sub, R_sub, features, l=1.5)









    Out[31]:





31.344056244274221



In [32]:

    
# Regularized cost on the full data set (10 features, l=1), as opposed to the
# subset evaluated in the previous cell.
rcmd.regularized_cost(param, Y, R, 10, l=1)  # total regularized cost









    Out[32]:





32520.682450229557

regularized gradient



In [33]:

    
# Compute the regularized gradient on the full data set, split it back into
# one gradient per parameter matrix, and check the shapes.
n_movie, n_user = Y.shape

serialized_reg_grad = rcmd.regularized_gradient(param, Y, R, 10)
X_grad, theta_grad = rcmd.deserialize(serialized_reg_grad, n_movie, n_user, 10)

assert X_grad.shape == X.shape
assert theta_grad.shape == theta.shape

parse `movie_ids.txt`



In [34]:

    
# Each line starts with an id token followed by a space and the title.
# Keep everything after the first space as the movie title.
with open('./data/movie_ids.txt', encoding='latin-1') as f:
    movie_list = np.array([line.strip().partition(' ')[2] for line in f])

reproduce my ratings



In [35]:

    
# My own ratings, keyed by 0-based movie index; every other movie stays at 0.
my_ratings = {
    0: 4,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    97: 2,
    182: 4,
    225: 5,
    354: 5,
}

ratings = np.zeros(1682)
ratings[list(my_ratings.keys())] = list(my_ratings.values())

prepare data



In [36]:

    
# Start again from the matrices held in the loaded .mat contents, then prepend
# my ratings as column 0 of Y so that I become user 0.
Y = movies_mat.get('Y')
R = movies_mat.get('R')

Y = np.insert(Y, 0, ratings, axis=1)
Y.shape









    Out[36]:





(1682, 944)



In [37]:

    
# Flag the movies I actually rated (non-zero entries) and prepend that column
# to R, matching the column prepended to Y.
has_rating = ratings != 0
R = np.insert(R, 0, has_rating, axis=1)
R.shape









    Out[37]:





(1682, 944)



In [58]:

    
# Training configuration.
n_movie, n_user = Y.shape  # sizes taken from the current ratings matrix
n_features = 50            # number of features to learn
l = 10                     # regularization strength handed to the cost/gradient



In [59]:

    
# Fix the seed of numpy's global generator so the random starting point, and
# therefore the trained model and the recommendations below, are reproducible
# when the notebook is restarted and re-run.
np.random.seed(0)

# Random starting point for the parameters: standard-normal entries.
X = np.random.standard_normal((n_movie, n_features))
theta = np.random.standard_normal((n_user, n_features))

X.shape, theta.shape









    Out[59]:





((1682, 50), (944, 50))



In [60]:

    
# Pack the randomly initialised X and theta into the single parameter object
# that is handed to the optimizer as its starting point (x0).
param = rcmd.serialize(X, theta)

normalized ratings



In [61]:

    
# Centre the ratings on the mean taken over every entry of Y
# (entries not flagged in R are included in that mean).
global_mean = Y.mean()
Y_norm = Y - global_mean

# Sanity check: the centred matrix should average to ~0.
Y_norm.mean()









    Out[61]:





4.6862111343939375e-17

training



In [62]:

    
import scipy.optimize as opt



In [63]:

    
# Fit the parameters: minimize the regularized cost on the centred ratings,
# starting from `param`, using TNC with the analytic gradient.
res = opt.minimize(
    rcmd.regularized_cost,
    param,
    args=(Y_norm, R, n_features, l),
    method='TNC',
    jac=rcmd.regularized_gradient,
)



In [64]:

    
# Display the optimization result returned by opt.minimize.
res









    Out[64]:





     fun: 64721.49781507616
     jac: array([ -1.58635281e-06,  -4.38201786e-08,  -8.32037088e-07, ...,
         1.99915339e-07,   4.98546832e-07,  -2.27175198e-07])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 2505
     nit: 74
  status: 1
 success: True
       x: array([  7.66989411e-01,   7.56774960e-01,   1.00894310e+00, ...,
         8.16557591e-01,   1.72679929e-04,   1.09739346e-01])



In [65]:

    
# Split the optimized parameters (res.x) back into the movie and user matrices.
X_trained, theta_trained = rcmd.deserialize(
    res.x, n_movie, n_user, n_features
)
X_trained.shape, theta_trained.shape









    Out[65]:





((1682, 50), (944, 50))



In [66]:

    
# Predicted (mean-centred) score for every movie/user pair:
# rows follow X_trained (movies), columns follow theta_trained (users).
prediction = X_trained @ theta_trained.T



In [67]:

    
# Column 0 belongs to the user inserted at the front of Y (me). Adding Y.mean()
# undoes the centring that was applied when Y_norm was built.
my_preds = prediction[:, 0] + Y.mean()



In [68]:

    
# Movie indices ordered from highest to lowest predicted rating:
# argsort is ascending, so reverse it.
ascending_idx = np.argsort(my_preds)
idx = ascending_idx[::-1]
idx.shape









    Out[68]:





(1682,)



In [69]:

    
# Predicted ratings of the ten best-ranked movies.
top_ten = idx[:10]
my_preds[top_ten]









    Out[69]:





array([ 4.12534978,  4.04414835,  3.99324636,  3.91902945,  3.81691251,
        3.81556458,  3.76602976,  3.76323186,  3.75906567,  3.75077289])



In [70]:

    
# Print the titles of the ten best-ranked movies.
# The loop variable is `title` rather than `m`, so it does not overwrite the
# notebook-level `m` (number of movies) defined near the top of the notebook.
for title in movie_list[idx][:10]:
    print(title)









    



Titanic (1997)
Star Wars (1977)
Shawshank Redemption, The (1994)
Forrest Gump (1994)
Raiders of the Lost Ark (1981)
Braveheart (1995)
Return of the Jedi (1983)
Usual Suspects, The (1995)
Godfather, The (1972)
Schindler's List (1993)



In [ ]: